{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re\n",
    "from sklearn.cross_validation import train_test_split, cross_val_score\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "import numpy as np\n",
    "import os\n",
    "import pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read(data_loc='',\n",
    "        file_name='full_docs_2.csv', tokenization='word',\n",
    "        random_state=None):\n",
    "    \"\"\"\n",
    "    Load a CSV dataset and return its rows in shuffled order.\n",
    "\n",
    "    :param data_loc: directory holding the dataset; an empty string means\n",
    "        the current working directory\n",
    "    :param file_name: name of the CSV file inside data_loc\n",
    "    :param tokenization: mode of tokenization : word, char (not used by\n",
    "        this function; kept so existing callers keep working)\n",
    "    :param random_state: optional seed forwarded to DataFrame.sample so the\n",
    "        shuffle can be reproduced; None leaves the shuffle unseeded\n",
    "    :return: DataFrame with every row of the file, shuffled and re-indexed\n",
    "        from 0\n",
    "    \"\"\"\n",
    "    # os.path.join copes with an empty data_loc; plain concatenation turned\n",
    "    # the default '' into the root-level path '/<file_name>'.\n",
    "    df = pandas.read_csv(os.path.join(data_loc, file_name))\n",
    "    # frac=1 samples every row, i.e. a full shuffle of the frame.\n",
    "    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def clean_str(string):\n",
    "    \"\"\"\n",
    "    Light-weight normalisation of one raw text string.\n",
    "\n",
    "    Deletes every backslash, single quote and double quote, trims the\n",
    "    surrounding whitespace and lower-cases what is left.\n",
    "    \"\"\"\n",
    "    # Same three patterns as before, applied one after the other.\n",
    "    for pattern in (r\"\\\\\", r\"\\'\", r\"\\\"\"):\n",
    "        string = re.sub(pattern, \"\", string)\n",
    "    return string.strip().lower()\n",
    "\n",
    "def text_cleaner(text):\n",
    "    \"\"\"\n",
    "    Strip punctuation and HTML markup from text and lower-case the result.\n",
    "\n",
    "    A fixed set of punctuation characters is first deleted or turned into a\n",
    "    space. An ordered list of regex substitutions then rewrites or removes\n",
    "    HTML tags, and the text is trimmed after every substitution.\n",
    "    \"\"\"\n",
    "    # (character, replacement) pairs, applied in this order.\n",
    "    char_swaps = (\n",
    "        (\".\", \"\"),\n",
    "        (\"[\", \" \"),\n",
    "        (\",\", \" \"),\n",
    "        (\"]\", \" \"),\n",
    "        (\"(\", \" \"),\n",
    "        (\")\", \" \"),\n",
    "        (\"\\\"\", \"\"),\n",
    "        (\"-\", \"\"),\n",
    "        (\"=\", \"\"),\n",
    "    )\n",
    "    for char, replacement in char_swaps:\n",
    "        text = text.replace(char, replacement)\n",
    "\n",
    "    # Ordered (pattern, replacement) pairs.\n",
    "    # NOTE(review): double quotes and '=' are deleted above, so the <a href>\n",
    "    # pattern below looks unable to match -- confirm the intended order.\n",
    "    substitutions = (\n",
    "        (r'>\\s+', u'>'),  # drop whitespace that follows a closing '>'\n",
    "        (r'\\s+', u' '),  # collapse each whitespace run to one space\n",
    "        (r'\\s*<br\\s*/?>\\s*', u'\\n'),  # <br> becomes a newline\n",
    "        (r'</(div)\\s*>\\s*', u'\\n'),  # </div> becomes a newline\n",
    "        (r'</(p|h\\d)\\s*>\\s*', u'\\n\\n'),  # </p> and </hN> become a blank line\n",
    "        (r'<head>.*<\\s*(/head|body)[^>]*>', u''),  # remove <head> to </head>\n",
    "        (r'<a\\s+href=\"([^\"]+)\"[^>]*>.*</a>', r'\\1'),  # show links instead of texts\n",
    "        (r'[ \\t]*<[^<]*?/?>', u''),  # remove remaining tags\n",
    "        (r'^\\s+', u''),  # remove spaces at the beginning\n",
    "    )\n",
    "    for pattern, replacement in substitutions:\n",
    "        text = re.compile(pattern).sub(replacement, text)\n",
    "        # strip() trims both ends, which is all rstrip()+strip() did.\n",
    "        text = text.strip()\n",
    "    return text.lower()\n",
    "\n",
    "class LabelDictionary():\n",
    "    \"\"\"Two-way lookup between raw class labels and integer class ids.\"\"\"\n",
    "    def __init__(self):\n",
    "        # y2i: raw label -> integer id; i2y: integer id -> raw label\n",
    "        self.y2i = {}\n",
    "        self.i2y = {}\n",
    "\n",
    "\n",
    "def create_class_dict(y_labels):\n",
    "    \"\"\"\n",
    "    Build a LabelDictionary that maps every distinct label to an id in\n",
    "    range(0, number_of_distinct_labels).\n",
    "\n",
    "    Ids follow the sorted order of the distinct labels, so the same set of\n",
    "    labels always produces the same mapping.\n",
    "\n",
    "    :param y_labels: array-like of labels (duplicates allowed)\n",
    "    :return: LabelDictionary with y2i and i2y filled in\n",
    "    \"\"\"\n",
    "    label_dict = LabelDictionary()\n",
    "    # np.unique already returns the distinct labels sorted. Routing them\n",
    "    # through set() discarded that order and made the ids depend on hash\n",
    "    # order, so the mapping could differ between runs.\n",
    "    for i, label in enumerate(np.unique(y_labels)):\n",
    "        label_dict.y2i[label] = i\n",
    "        label_dict.i2y[i] = label\n",
    "    return label_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_loc = '/home/ml/ksinha4/mlp/hier-class/data'\n",
    "# Load the train and test CSVs; read() returns their rows shuffled.\n",
    "df_train = read(data_loc=data_loc, file_name=\"df_small_train.csv\")\n",
    "df_test = read(data_loc=data_loc, file_name=\"df_small_test.csv\")\n",
    "# Normalise the text column of both frames with clean_str.\n",
    "for frame in (df_train, df_test):\n",
    "    frame.text = [clean_str(x) for x in frame.text]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_test_split returns (x_train, x_val, y_train, y_val) for two input\n",
    "# arrays, so the targets have to be listed in exactly that order.\n",
    "l1_x_train, l1_x_val, l1_y_train, l1_y_val = \\\n",
    "        train_test_split(df_train.text, df_train.l1, test_size=0.1, random_state=0)\n",
    "# Number of distinct level-1 labels and the label <-> id lookup for them.\n",
    "number_of_classes_L1 = len(df_train.l1.unique())\n",
    "l1_dict = create_class_dict(df_train.l1.unique())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'TopicalConcept': 0, 'Event': 1, 'Work': 2, 'SportsSeason': 3, 'Place': 4, 'Species': 5, 'UnitOfWork': 6, 'Agent': 7, 'Device': 8}\n"
     ]
    }
   ],
   "source": [
    "# Show the label -> id mapping built for the level-1 classes.\n",
    "print(l1_dict.y2i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[None, None, None, None, None, None, None, None, None]\n"
     ]
    }
   ],
   "source": [
    "# One placeholder slot per level-1 class.\n",
    "l2_data = [None]*len(l1_dict.y2i)\n",
    "print(l2_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Slot i receives the training rows whose level-1 label has id i.\n",
    "for i in range(len(l1_dict.i2y)):\n",
    "    l1_label = l1_dict.i2y[i]\n",
    "    l2_data[i] = df_train[df_train.l1 == l1_label]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Dataset():\n",
    "    \"\"\"\n",
    "    Container for one node of the label hierarchy: its train/validation\n",
    "    split, its label lookup and the datasets of its child classes.\n",
    "    \"\"\"\n",
    "    def __init__(self):\n",
    "        # Inputs and targets of the train / validation split.\n",
    "        self.x_train = None\n",
    "        self.y_train = None\n",
    "        self.x_val = None\n",
    "        self.y_val = None\n",
    "        # Label lookup for this node. data_pipline stores it under `dict`,\n",
    "        # so that attribute is declared here as well; `label_dict` is kept\n",
    "        # for any existing reader of the old name.\n",
    "        self.label_dict = None\n",
    "        self.dict = None\n",
    "        self.number_of_classes = None\n",
    "        # List of child Dataset objects, or None when there are none.\n",
    "        self.childs = None\n",
    "\n",
    "\n",
    "def data_pipline(df_train,level=1,stop_level=3):\n",
    "    \"\"\"\n",
    "    Recursively build the Dataset tree for label levels level..stop_level.\n",
    "\n",
    "    :param df_train: DataFrame with a `text` column and one label column\n",
    "        per level, named l<level>\n",
    "    :param level: label level handled by this call\n",
    "    :param stop_level: deepest label level to build\n",
    "    :return: Dataset for this level; its `childs` is a list with one\n",
    "        Dataset per class (indexed by class id), or None at the last level\n",
    "    \"\"\"\n",
    "    labels = df_train['l%d' % level]\n",
    "\n",
    "    d = Dataset()\n",
    "    d.number_of_classes = len(labels.unique())\n",
    "    # Keep the historical `dict` attribute and also fill `label_dict`, the\n",
    "    # name Dataset declares, so both spellings refer to the same lookup.\n",
    "    d.dict = d.label_dict = create_class_dict(labels.unique())\n",
    "    d.x_train, d.x_val, d.y_train, d.y_val = \\\n",
    "            train_test_split(df_train.text,\n",
    "                             labels.apply(lambda x: d.dict.y2i[x]),\n",
    "                             test_size=0.1, random_state=0)\n",
    "\n",
    "    # Deepest requested level reached: d.childs stays None.\n",
    "    if level >= stop_level:\n",
    "        return d\n",
    "\n",
    "    # Child i is built from the rows whose label at this level has id i.\n",
    "    # stop_level is forwarded explicitly; the recursive call used to fall\n",
    "    # back to the default and ignore the value the caller passed in.\n",
    "    d.childs = [\n",
    "        data_pipline(df_train[labels == d.dict.i2y[i]],\n",
    "                     level=level + 1, stop_level=stop_level)\n",
    "        for i in range(len(d.dict.i2y))\n",
    "    ]\n",
    "    return d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the Dataset tree for df_train, starting at label level 1.\n",
    "d = data_pipline(df_train,1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "177075    5\n32155     7\n182380    5\n125377    5\n173033    5\n57524     4\n243873    4\n73242     7\n117668    7\n183114    2\nName: l1, dtype: int64\n{'Person': 0, 'GridironFootballPlayer': 1, 'OrganisationMember': 2, 'MusicalArtist': 3, 'Actor': 4, 'WinterSportPlayer': 5, 'RacingDriver': 6, 'SportsTeam': 7, 'Athlete': 8, 'SportsLeague': 9, 'Organisation': 10, 'Coach': 11, 'Company': 12, 'Scientist': 13, 'SportsManager': 14, 'ComicsCharacter': 15, 'MotorcycleRider': 16, 'BritishRoyalty': 17, 'Group': 18, 'Writer': 19, 'Broadcaster': 20, 'Politician': 21, 'Artist': 22, 'EducationalInstitution': 23, 'Cleric': 24, 'Wrestler': 25, 'FictionalCharacter': 26, 'VolleyballPlayer': 27, 'Presenter': 28, 'Boxer': 29}\n"
     ]
    }
   ],
   "source": [
    "# First ten validation targets at level 1 (integer class ids).\n",
    "print(d.y_val[:10])\n",
    "# Label -> id mapping of the child dataset for the level-1 class with id 7.\n",
    "print(d.childs[7].dict.y2i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
